"""
#
# Flow stability for dynamic community detection https://arxiv.org/abs/2101.06131v2
#
# Copyright (C) 2021 Alexandre Bovet <alexandre.bovet@maths.ox.ac.uk>
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


Creates a dataframe with the DOI and publication date of each article and
saves it as `df_doi_date.csv.gz`.

"""

import json
import os
import pandas as pd
from multiprocessing import Pool


# Location of the APS metadata, available at https://journals.aps.org/datasets

datadir = '../paper_data/aps/aps-dataset-metadata-2018/'

#%% first get doi and date for each paper

# Collect the path of every file found two directory levels below `datadir`
# (datadir/<journ>/<vol>/<file>). Each level is visited in sorted order so
# that the resulting list is deterministic.
files = []
for journ in sorted(os.listdir(datadir)):
    journ_dir = os.path.join(datadir, journ)
    if not os.path.isdir(journ_dir):
        # A stray file next to the first-level folders (README, .DS_Store, ...)
        # would make os.listdir() below raise NotADirectoryError.
        continue
    for vol in sorted(os.listdir(journ_dir)):
        vol_dir = os.path.join(journ_dir, vol)
        if not os.path.isdir(vol_dir):
            # Same guard one level down.
            continue
        files.extend(sorted(os.path.join(vol_dir, f)
                            for f in os.listdir(vol_dir)))
    
        

def worker(file):
    """
    Read one JSON metadata file and extract its identifier and date.

    Parameters
    ----------
    file : str
        Path of the JSON file to read.

    Returns
    -------
    tuple
        `(file, id, date)`, where `id` and `date` are the values stored under
        the 'id' and 'date' keys (None if a key is missing), when the record's
        'articleType' is 'article'; `(file, None, None)` otherwise.
    """

    # Progress trace: which process is handling which file.
    print(os.getpid(), file)

    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not UTF-8 everywhere (e.g. cp1252 on Windows) and
    # would break or garble records containing non-ASCII characters.
    with open(file, 'r', encoding='utf-8') as fopen:
        article = json.load(fopen)

    if article.get('articleType') == 'article':
        return (file, article.get('id'), article.get('date'))

    # Anything that is not a regular article is kept as a placeholder row.
    return (file, None, None)
    
        
#%% 
if __name__ == '__main__':

    out_path = '../data/aps/df_doi_date.csv.gz'

    # Make sure the output folder exists before starting the pool, so that a
    # missing directory cannot make to_csv() fail after all files have
    # already been parsed.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    # Parse the metadata files in 4 worker processes; map() returns the
    # results in the same order as `files`.
    with Pool(4) as p:
        res = p.map(worker, files, chunksize=1)

    # One row per input file: (file, doi, date).
    df_doi_dates = pd.DataFrame(data=res, columns=['file','doi','date'])

    # The '.gz' suffix makes pandas gzip-compress the CSV.
    df_doi_dates.to_csv(out_path)
    
